# Set the directory where your CSV files are located
data_folder <- "data/mocap_data_prepped"
# List all CSV files in the specified folder
csv_files <- list.files(data_folder, pattern = "*.csv", full.names = TRUE)
# Initialize an empty list to store the dataframes
list_of_dataframes <- list()
# Loop through each CSV file, read it into a dataframe, and store it in the list with its original file name
for (csv_file in csv_files) {
# Extract the file name without the extension
file_name <- tools::file_path_sans_ext(basename(csv_file))
# Read the CSV file into a dataframe and assign it to the list with the file name as the list name
list_of_dataframes[[file_name]] <- read.csv(csv_file)
}
In our preprocessing we want to cover the following:
We only want numeric values in our set. Therefore we convert to a wide format where each combination of marker and subject has their own xyz coordinates. We write a function to do this for us.
convert_to_wide_format <- function(df) {
df %>%
unite("marker_subject", marker, subject, remove = TRUE) %>%
pivot_wider(
names_from = marker_subject,
values_from = c(x, y, z)
)
}
Then we apply this function to each dataframe in the list_of_dataframes
list_of_dataframes <- lapply(list_of_dataframes, convert_to_wide_format)
Now we effectively have dataframes where each rows represents their own time-point with a row every .00300 seconds. We proceed with removing the columns that do not represent an x,y and z.
for (i in seq_along(list_of_dataframes)) {
list_of_dataframes[[i]] <- list_of_dataframes[[i]] %>%
select(-index, -condition, -group, -elapsed_time)
}
rm(i)
For each time we see an NA, we add the last known value for each column. This method is called “last observation carries forward”. If there are no last known values, we fill the NAs with the next known value (next observation carries backwards).We start by making a function:
library(tidyr)
fill_NAs_with_LOCF_NOCB <- function(df) {
# Apply Last Observation Carried Forward (LOCF)
df_filled_locf <- apply(df, 2, function(x) {
if (all(is.na(x))) {
return(x)
} else {
return(na.locf(x, na.rm = FALSE))
}
})
# Convert back to dataframe
df_filled_locf <- as.data.frame(df_filled_locf)
# Apply Next Observation Carried Backward (NOCB)
df_filled_locf_nocb <- apply(df_filled_locf, 2, function(x) {
if (all(is.na(x))) {
return(x)
} else {
return(na.locf(x, fromLast = TRUE, na.rm = FALSE))
}
})
# Convert back to dataframe and return
return(as.data.frame(df_filled_locf_nocb))
}
list_of_dataframes <- lapply(list_of_dataframes, fill_NAs_with_LOCF_NOCB)
#quick na check
sapply(list_of_dataframes, function(df) any(is.na(df)))
## group0_jointlead group0_leadfollow group1_jointlead group1_leadfollow
## FALSE FALSE FALSE FALSE
## group12_jointlead group2_jointlead group3_jointlead group3_leadfollow
## FALSE FALSE FALSE FALSE
## group4_leadfollow group6_jointlead group6_leadfollow group8_leadfollow
## FALSE FALSE FALSE FALSE
There are no longer any NAs, and we proceed with standardization.
Our subjects were facing eachother and mirroring eachother. We want to effectively normalise their points to simulate them being the same height (having the same skeleton) and standing in the same spot, facing the same direction.
standardize_and_normalize_dataframe <- function(df) {
# Standardize each column of the dataframe
standardized_df <- scale(df)
# Normalize the standardized dataframe using min-max scaling
min_max_normalized_df <- as.data.frame(apply(standardized_df, 2, function(x) {
(x - min(x)) / (max(x) - min(x))
}))
return(min_max_normalized_df)
}
list_of_dataframes <- lapply(list_of_dataframes, standardize_and_normalize_dataframe)
list_of_dataframes_A = list()
list_of_dataframes_B = list()
for (df_name in names(list_of_dataframes)) {
df = list_of_dataframes[[df_name]]
# Columns ending with _A
cols_A = grep("_A$", names(df), value = TRUE)
if (length(cols_A) > 0) {
list_of_dataframes_A[[paste0(df_name)]] = df[, cols_A]
}
# Columns ending with _B
cols_B = grep("_B$", names(df), value = TRUE)
if (length(cols_B) > 0) {
list_of_dataframes_B[[paste0(df_name)]] = df[, cols_B]
}
}
library(ggplot2)
library(tidyr)
# Function to create a correlation matrix plot using ggplot2
create_corr_plot <- function(corr_matrix, df_name, subject) {
# Transforming the correlation matrix into a long format
corr_data <- as.data.frame(as.table(corr_matrix))
names(corr_data) <- c("Var1", "Var2", "Correlation")
# Create the plot
ggplot(corr_data, aes(Var1, Var2, fill = Correlation)) +
geom_tile() +
scale_fill_gradient2(low = "blue", high = "red", mid = "white", midpoint = 0) +
theme_ipsum() + # Apply the theme_ipsum
labs(
title = "Correlation Matrix",
subtitle = paste(df_name, subject)
) +
theme(axis.text.x = element_text(angle = 45, hjust = 1)) +
coord_fixed() # Ensure a 1:1 aspect ratio
}
# Looping through list_of_dataframes_A
for (i in seq_along(list_of_dataframes_A)) {
df_name_A <- names(list_of_dataframes_A)[i]
corr_matrix_A <- cor(list_of_dataframes_A[[i]])
plot_A <- create_corr_plot(corr_matrix_A, df_name_A, "Subject A")
print(plot_A)
}
# Looping through list_of_dataframes_B
for (i in seq_along(list_of_dataframes_B)) {
df_name_B <- names(list_of_dataframes_B)[i]
corr_matrix_B <- cor(list_of_dataframes_B[[i]])
plot_B <- create_corr_plot(corr_matrix_B, df_name_B, "Subject B")
print(plot_B)
}
# Load the necessary library for PCA
library(FactoMineR)
# Create empty lists to store PCA results and principal components
pca_results_list_A <- list()
pca_results_list_B <- list()
pca_components_list_A <- list()
pca_components_list_B <- list()
# Define the number of principal components to use
num_pca_components_to_use <- 2 # You can adjust this as needed
for (i in 1:16) {
if (i <= length(list_of_dataframes_A)) {
# Extract the dataframe and its name
df_A <- list_of_dataframes_A[[i]]
df_name_A <- names(list_of_dataframes_A)[i]
# Perform PCA
pca_A <- PCA(df_A, graph = FALSE)
# Save PCA result in the list with dataframe name as key
pca_results_list_A[[df_name_A]] <- pca_A
# Extract and store principal components with dataframe name as key
pca_components_A <- pca_A$ind$coord[, 1:num_pca_components_to_use, drop = FALSE]
pca_components_list_A[[df_name_A]] <- pca_components_A
}
if (i <= length(list_of_dataframes_B)) {
# Extract the dataframe and its name
df_B <- list_of_dataframes_B[[i]]
df_name_B <- names(list_of_dataframes_B)[i]
# Perform PCA
pca_B <- PCA(df_B, graph = FALSE)
# Save PCA result in the list with dataframe name as key
pca_results_list_B[[df_name_B]] <- pca_B
# Extract and store principal components with dataframe name as key
pca_components_B <- pca_B$ind$coord[, 1:num_pca_components_to_use, drop = FALSE]
pca_components_list_B[[df_name_B]] <- pca_components_B
}
}
# Now you have two sets of lists:
# - pca_results_list_A and pca_results_list_B containing PCA results
# - pca_components_list_A and pca_components_list_B containing principal components
# with dataframe names as keys
rm(corr_matrix_A, corr_matrix_B, df, df_A, df_B, pca_A, pca_B)
library(factoextra)
library(hrbrthemes) # for theme_ipsum()
# Loop through each dataframe in list_of_dataframes
for (df_name in names(list_of_dataframes)) {
# Extract the current dataframe
g <- list_of_dataframes[[df_name]]
# Compute correlation matrix and perform PCA
corr_matrix <- cor(g)
data.pca <- princomp(corr_matrix)
# Create the scree plot
scree_plot <- fviz_eig(data.pca,
ncp = 5,
addlabels = TRUE,
barfill = global_fill_colour,
barcolor = global_fill_colour,
ggtheme = theme_ipsum()) +
ylim(0, 90) +
labs(subtitle = df_name) # Add dynamic subtitle
# Display or save the plot
print(scree_plot)
# To save: ggsave(paste0("ScreePlot_", df_name, ".png"), scree_plot)
}
Next we calculate the area between the components and pick the combination which results in the smallest area: in our case that must be the case where they are aligned. we do this to ensure there is no sign ambiguity when running PCA. We can justify doing this as the goal of the task was to mirror eachother, and it is highly unlikely any participant was doing the opposite of their partner.
# Function to calculate the area between two components, aligning their lengths
calculate_area_between_components <- function(comp1, comp2) {
# Align lengths
min_length <- min(nrow(comp1), nrow(comp2))
comp1_aligned <- comp1[1:min_length, ]
comp2_aligned <- comp2[1:min_length, ]
# Calculate area
sum(abs(comp1_aligned - comp2_aligned))
}
# Function to align a component based on the smallest area between lines
align_component_based_on_area <- function(component_A, component_B) {
# Align component lengths for area calculation
original_area <- calculate_area_between_components(component_A, component_B)
flipped_area <- calculate_area_between_components(component_A, -component_B)
if (flipped_area < original_area) {
return(-component_B)
} else {
return(component_B)
}
}
#THERE ARE ISSUERS WITH VERY LARGE VECTORS. IM TRYING TO COMBAT THAT PROBLEM HERE
batch_size <- 100 # Set a batch size that your system can handle
num_batches <- ceiling(length(pca_components_list_A) / batch_size)
for (batch in 1:num_batches) {
start_index <- (batch - 1) * batch_size + 1
end_index <- min(batch * batch_size, length(pca_components_list_A))
for (i in start_index:end_index) {
if (!is.null(pca_components_list_A[[i]]) && !is.null(pca_components_list_B[[i]])) {
pca_components_list_B[[i]] <- align_component_based_on_area(pca_components_list_A[[i]], pca_components_list_B[[i]])
}
}
# Optional: Save intermediate results to disk, clear memory, etc.
}
# Clean up
rm(list_of_dataframes_A, list_of_dataframes_B)
We can now visualise how the principle components behave, and get an idea of how synchronised they are before having formally conducted DTW.
# Loop through each index of the component lists
for (i in seq_along(pca_components_list_A)) {
# Extract components for Subject A and Subject B
component_A <- pca_components_list_A[[i]]
component_B <- pca_components_list_B[[i]]
# Ensure both components have the same number of rows
min_rows <- min(nrow(component_A), nrow(component_B))
# Combine both components into a single dataframe
combined_df <- data.frame(
time = rep(1:min_rows, 2),
value = c(component_A[1:min_rows, 1], component_B[1:min_rows, 1]),
group = rep(c("Subject A", "Subject B"), each = min_rows)
)
# Create a time series plot for both subjects
component_name_A <- names(pca_components_list_A)[i]
component_name_B <- names(pca_components_list_B)[i]
group_number <- paste(component_name_A)
p <- ggplot(combined_df, aes(x = time, y = value, color = group)) +
geom_line(size = 1) +
labs(
title = "Principle components",
subtitle = group_number,
x = "Time", y = "Component Value", color = "Subject") +
scale_color_manual(values = aesthetic_highlight_difference_palette)
# Print the plot
print(p)
}
OBS: LOOK AT ORIGINAL FOOTAGE FROM:
Group 12_leadfollow
Group 13_jointlead
Group 13_leadfollow
Group2_leadfollow
Group 6_jointlead
All components are saved in a folder to later be processed using DTW.
library(fs)
# Create the "results/PCA" directory if it doesn't already exist
dir_create(path("results", "PCA"), recursive = TRUE)
# Function to save each matrix in the component list as a CSV with an optional suffix
save_component_list_as_csv <- function(component_list, dir_path, suffix = "") {
# Ensure component_list has named indices
if (is.null(names(component_list))) {
stop("The component list must have named indices.")
}
for (comp_name in names(component_list)) {
# Use the component name for the file name, appending the suffix
file_path <- file.path(dir_path, paste0(comp_name, suffix, ".csv"))
write.csv(component_list[[comp_name]], file_path, row.names = FALSE)
}
# Print a message to indicate completion
cat("All components have been saved as CSV in the '", dir_path, "' directory.\n")
}
# Save pca_components_list_A and pca_components_list_B to "results/PCA" as CSV files
# The files from pca_components_list_A will have an '_A' suffix
save_component_list_as_csv(pca_components_list_A, "results/PCA", "_A")
## All components have been saved as CSV in the ' results/PCA ' directory.
# The files from pca_components_list_B can have no suffix or a different one (e.g., "_B")
save_component_list_as_csv(pca_components_list_B, "results/PCA", "_B")
## All components have been saved as CSV in the ' results/PCA ' directory.